# orig data missing???
SUB <- read_csv("data-raw/geo/suburb-and-adjoining-suburb-november-2019.zip") %>%
clean_names() %>%
remove_empty(c("rows", "cols")) %>%
select(suburb_name) %>%
rename(SSC_NAME16 = suburb_name) %>%
distinct() %>%
mutate(SSC_NAME16 = str_to_title(SSC_NAME16)) %>%
arrange(SSC_NAME16)
write_rds(SUB, "data/geo/clean/SUB.Rds")This might very well include areas with no pops (and therefore no SEIFA), for instance:
# A tibble: 2 x 1
SSC_NAME16
<chr>
1 Brisbane Airport
2 Port Of Brisbane
Names from Brisbane containing (Brisbane - Qld), names from Qld containing (Qld) have to be cleaned to match BCC data.
Mcdowall is called McDowall and Mount Coot-tha is Mount Coot-Tha - these have been unified as well.
unzip("data-raw/geo/1270055003_ssc_2016_aust_shape.zip",
exdir = "data-raw/geo")
SSC <- st_read("data-raw/geo/1270055003_ssc_2016_aust_shape/SSC_2016_AUST.shp",
stringsAsFactors = FALSE) %>%
mutate(SSC_CODE16 = as.integer(SSC_CODE16)) %>%
select(-STE_NAME16, -STE_CODE16, -AREASQKM16) %>%
st_transform(3112) %>%
filter(!st_is_empty(geometry)) %>%
mutate(SSC_NAME16_orig = SSC_NAME16) %>%
mutate(SSC_NAME16 = str_remove(SSC_NAME16,
fixed(" (Brisbane - Qld)"))) %>%
mutate(SSC_NAME16 = str_remove(SSC_NAME16,
fixed(" (Qld)"))) %>%
mutate(SSC_NAME16 = ifelse(SSC_NAME16 == "McDowall",
"Mcdowall", SSC_NAME16)) %>%
mutate(SSC_NAME16 = ifelse(SSC_NAME16 == "Mount Coot-tha",
"Mount Coot-Tha", SSC_NAME16))
# SSC <- rmapshaper::ms_simplify(SSC, keep = 0.05, weighting = 0.7) # default settings
write_rds(SSC, "data/geo/SSC_2016_AUST.Rds")
unlink("data-raw/geo/1270055003_ssc_2016_aust_shape", recursive = TRUE)Stones Corner doesn’t exist in ABS but it does in BCC. It seems it’s part of Greenslopes.
# A tibble: 1 x 3
SSC_NAME16 SSC_CODE16 SSC_NAME16_orig
<chr> <int> <chr>
1 Stones Corner NA <NA>
After clean the datasets were merged.
Full map
SEIFA <- read_xls("data-raw/SEIFA/2033055001 - ssc indexes.xls",
sheet = "Table 1", skip = 5, n_max = 13719, na = "-") %>%
clean_names() %>%
remove_empty(c("rows", "cols")) %>%
dplyr::rename(SSC_CODE16 = x1,
SSC_NAME16 = x2,
IRSD = score_3,
IRSD_d = decile_4,
IRSAD = score_5,
IRSAD_d = decile_6,
IER = score_7,
IER_d = decile_8,
IEO = score_9,
IEO_d = decile_10,
URP = x11,
caution = x12) %>%
# deciles kept as integers here
mutate(SSC_CODE16 = as.integer(SSC_CODE16),
IRSD = as.integer(IRSD),
IRSAD = as.integer(IRSAD),
IER = as.integer(IER),
IEO = as.integer(IEO),
IRSD_d = as.integer(IRSD_d),
IRSAD_d = as.integer(IRSAD_d),
IER_d = as.integer(IER_d),
IEO_d = as.integer(IEO_d),
URP = as.integer(URP)
) %>%
mutate(caution = as.logical(ifelse(is.na(caution), "False", "True")))Comparing Australia & Brisbane, example of IRSD.
x <integer>
# total N=13713 valid N=13691 mean=5.50 sd=2.87
Value | N | Raw % | Valid % | Cum. %
---------------------------------------
1 | 1369 | 9.98 | 10.00 | 10.00
2 | 1369 | 9.98 | 10.00 | 20.00
3 | 1370 | 9.99 | 10.01 | 30.01
4 | 1370 | 9.99 | 10.01 | 40.01
5 | 1367 | 9.97 | 9.98 | 50.00
6 | 1370 | 9.99 | 10.01 | 60.00
7 | 1369 | 9.98 | 10.00 | 70.00
8 | 1369 | 9.98 | 10.00 | 80.00
9 | 1371 | 10.00 | 10.01 | 90.02
10 | 1367 | 9.97 | 9.98 | 100.00
<NA> | 22 | 0.16 | <NA> | <NA>
x <integer>
# total N=193 valid N=184 mean=7.79 sd=2.37
Value | N | Raw % | Valid % | Cum. %
-------------------------------------
1 | 3 | 1.55 | 1.63 | 1.63
2 | 6 | 3.11 | 3.26 | 4.89
3 | 5 | 2.59 | 2.72 | 7.61
4 | 5 | 2.59 | 2.72 | 10.33
5 | 13 | 6.74 | 7.07 | 17.39
6 | 12 | 6.22 | 6.52 | 23.91
7 | 28 | 14.51 | 15.22 | 39.13
8 | 18 | 9.33 | 9.78 | 48.91
9 | 33 | 17.10 | 17.93 | 66.85
10 | 61 | 31.61 | 33.15 | 100.00
<NA> | 9 | 4.66 | <NA> | <NA>
Few areas with missing SEIFA
SSC_NAME16 IRSD_d IRSAD_d IER_d IEO_d URP
1 Banks Creek NA NA NA NA NA
2 Brisbane Airport NA NA NA NA NA
3 Eagle Farm NA NA NA NA NA
4 Enoggera Reservoir NA NA NA 10 25
5 Karawatha NA NA NA NA NA
6 Larapinta NA NA NA NA NA
7 Lytton NA NA NA NA NA
8 Mount Coot-Tha NA NA NA NA NA
9 Port Of Brisbane NA NA NA NA NA
These were excluded.
Few cases with ABS flag caution.
SSC_NAME16 IRSD IRSD_d IRSAD IRSAD_d IER IER_d IEO IEO_d URP
1 Bulwer 1014 6 996 6 1012 5 1024 7 49
2 Cowan Cowan 1014 6 996 6 1012 5 1024 7 28
3 England Creek 1004 5 978 5 1063 8 949 3 33
4 Kooringal 1014 6 996 6 1012 5 1024 7 45
Usually with very small pop numbers.
These remain included.
Original values of indices were used to calculate ‘local deciles’ using SSCs for Brisbane only
SSC %<>%
mutate(IRSD_d_orig = IRSD_d,
IRSD_d = ntile(IRSD, 10),
IRSAD_d_orig = IRSAD_d,
IRSAD_d = ntile(IRSAD, 10),
IER_d_orig = IER_d,
IER_d = ntile(IER, 10),
IEO_d_orig = IEO_d,
IEO_d = ntile(IEO, 10)) %>%
mutate(across(ends_with("_d"), factor)) %>%
mutate(across(ends_with("_d_orig"), factor)) %>%
relocate(IRSD_d_orig, .after = IRSD_d) %>%
relocate(IRSAD_d_orig, .after = IRSAD_d) %>%
relocate(IER_d_orig, .after = IER_d) %>%
relocate(IEO_d_orig, .after = IEO_d)
write_rds(SSC, "data/geo/SSC.Rds")Note: Deciles are represented as factor variable. Certain application could benefit from changing it to ordered?
Agreement across specific indices :
IRSD_dIRSD_d_orig == IRSD_d <lgl>
# total N=184 valid N=184 mean=0.11 sd=0.32
Value | N | Raw % | Valid % | Cum. %
--------------------------------------
FALSE | 163 | 88.59 | 88.59 | 88.59
TRUE | 21 | 11.41 | 11.41 | 100.00
<NA> | 0 | 0.00 | <NA> | <NA>
IRSAD_dIRSAD_d_orig == IRSAD_d <lgl>
# total N=184 valid N=184 mean=0.11 sd=0.32
Value | N | Raw % | Valid % | Cum. %
--------------------------------------
FALSE | 163 | 88.59 | 88.59 | 88.59
TRUE | 21 | 11.41 | 11.41 | 100.00
<NA> | 0 | 0.00 | <NA> | <NA>
IER_dIER_d_orig == IER_d <lgl>
# total N=184 valid N=184 mean=0.71 sd=0.45
Value | N | Raw % | Valid % | Cum. %
--------------------------------------
FALSE | 53 | 28.80 | 28.80 | 28.80
TRUE | 131 | 71.20 | 71.20 | 100.00
<NA> | 0 | 0.00 | <NA> | <NA>
IEO_dIEO_d_orig == IEO_d <lgl>
# total N=184 valid N=184 mean=0.12 sd=0.33
Value | N | Raw % | Valid % | Cum. %
--------------------------------------
FALSE | 162 | 88.04 | 88.04 | 88.04
TRUE | 22 | 11.96 | 11.96 | 100.00
<NA> | 0 | 0.00 | <NA> | <NA>
dog_cost <- read_xlsx("data-raw/costs/dog_expensive.xlsx") %>%
clean_names() %>%
remove_empty(c("rows", "cols")) %>%
select(-web_source) %>%
rename(dog_breed = breed) %>%
select(dog_breed) %>%
distinct() %>%
# correcting names for better matching - these one used in BCE
mutate(
dog_breed = case_when(
dog_breed == "Hairless Chinese Crested" ~ "Chinese Crested Dog",
dog_breed == "Saint Bernard" ~ "St Bernard",
TRUE ~ as.character(dog_breed))
) %>%
# synonyms
add_row(dog_breed = "Dogue de Bordeaux") %>%
add_row(dog_breed = "Bulldog") %>%
add_row(dog_breed = "British Bulldog") %>%
mutate(expensive = "yes") %>%
arrange(dog_breed)Top 20 most expensive dogs (+3 synonyms!)
Data scraped from https://top10petinsurance.com.au/pet-insurance-prices on the 30th March 2020
# packages needed
# install.packages("rvest")
p_load(rvest, tidyverse)
# scraping table done with this using chrome: https://www.r-bloggers.com/using-rvest-to-scrape-an-html-table/
url <- 'https://top10petinsurance.com.au/pet-insurance-prices/'
dog_insurance <- url %>%
xml2::read_html() %>%
html_nodes(xpath='//*[@id="post-1016"]/div/table') %>%
html_table()
dog_insurance <- dog_insurance[[1]]
head(dog_insurance)
write_rds(dog_insurance, "data-raw/costs/dog_insurance.Rds") # extracted on the 30 March 2020dog_insurance <- read_rds("data-raw/costs/dog_insurance.Rds") %>%
as_tibble() %>%
clean_names() %>%
remove_empty(c("rows", "cols")) %>%
# select(-) %>%
mutate(average_accident_policy_cost_annual =
gsub(",", "",
average_accident_policy_cost_annual,
fixed = TRUE),
average_illness_policy_cost_annual =
gsub(",", "",
average_illness_policy_cost_annual,
fixed = TRUE),
average_comprehensive_policy_cost_annual =
gsub(",", "",
average_comprehensive_policy_cost_annual,
fixed = TRUE)
) %>%
mutate(average_accident_policy_cost_annual =
as.numeric(gsub("$", "",
average_accident_policy_cost_annual,
fixed = TRUE)),
average_illness_policy_cost_annual =
as.numeric(gsub("$", "",
average_illness_policy_cost_annual,
fixed = TRUE)),
average_comprehensive_policy_cost_annual =
as.numeric(gsub("$", "",
average_comprehensive_policy_cost_annual,
fixed = TRUE))
)cost_compared_to_other_breeds <character>
# total N=558 valid N=558 mean=1.97 sd=0.42
Value | N | Raw % | Valid % | Cum. %
------------------------------------------------------------
Above average | 58 | 10.39 | 10.39 | 10.39
Below average | 457 | 81.90 | 81.90 | 92.29
Significantly above average | 43 | 7.71 | 7.71 | 100.00
<NA> | 0 | 0.00 | <NA> | <NA>
Breeds in Significantly above average category:
Breeds in Above average category:
Raw dataset named cars-bis-open-data-animal-permits-3-jan-2019 consists of 107,405 records.
Excluding records with:
permit_name: Breeders Permit, Cattery Permit, Racehorses Permit, Pet Shop Permit, Domestic Dog Permit & Guard Dog Permitdog_breed valuesdog_breed listed as Unknown or CrossStones Corner values were assigned to Greenslopes SSC (see information in section above).
dog_ownership %<>%
clean_names() %>%
remove_empty(c("rows", "cols")) %>%
# all the same here
select(-permit_group, -permit_status) %>%
# special permits?
filter(!permit_name %in% c("Breeders Permit", "Cattery Permit", "Racehorses Permit", "Pet Shop Permit")) %>%
filter(!permit_name %in% c("Domestic Dog Permit", "Guard Dog Permit")) %>%
rename(dog_breed = animal_breed,
SSC_NAME16 = application_location_suburb) %>%
mutate(SSC_NAME16 = str_to_title(SSC_NAME16)) %>%
# correct suburb
mutate(SSC_NAME16 = ifelse(SSC_NAME16 == "Stones Corner", "Greenslopes", SSC_NAME16)) %>%
# missing geo
filter(!is.na(SSC_NAME16)) %>%
# missing breed
filter(!is.na(dog_breed)) %>%
filter(!dog_breed %in% c("Unknown",
"Medium Cross Breed", "Small Cross Breed", "Large Cross Breed")) %>%
# few cleans for better matches
mutate(
dog_breed = case_when(
dog_breed == "German Shepherd Dog (Long Stock Coat)" ~ "German Shepherd",
dog_breed == "German Shepherd Dog" ~ "German Shepherd",
dog_breed == "Central Asian Shepherd Dog " ~ "Central Asian Shepherd",
dog_breed == "Kangal Dog" ~ "Kangal",
dog_breed == "Bulldog" ~ "British Bulldog",
dog_breed == "Collie (Rough)" ~ "Rough Collie",
dog_breed == "Collie (Smooth)" ~ "Smooth Collie",
TRUE ~ as.character(dog_breed))
) Prepared dataset consists of 106,018 records.
dog_costdog_ownership_cost <- left_join(dog_ownership, dog_cost, by = "dog_breed") %>%
# binary indicator "expensive" and "non-expensive" dog breeds according to dog_cost
mutate(
expensive = case_when(
is.na(expensive) ~ 0,
expensive == "yes" ~ 1
)
)Expensive dogs in Brisbane:
expensive <numeric>
# total N=106018 valid N=106018 mean=0.14 sd=0.35
Value | N | Raw % | Valid % | Cum. %
----------------------------------------
0 | 90940 | 85.78 | 85.78 | 85.78
1 | 15078 | 14.22 | 14.22 | 100.00
<NA> | 0 | 0.00 | <NA> | <NA>
dog_insuranceCorrecting names for better match
dog_insurance %<>%
mutate(
dog_breed = case_when(
# this is a bit tricky! might need sensitivity?
dog_breed == "French Poodle" ~ "Poodle",
# just naming issues
dog_breed == "Poodle – Standard" ~ "Poodle (Standard)",
dog_breed == "Miniature Poodle" ~ "Poodle (Miniature)",
dog_breed == "Poodle – Toy" ~ "Poodle (Toy)",
dog_breed == "Shar-Pei" ~ "Shar Pei",
dog_breed == "German Short Haired Pointer" ~ "German Shorthaired Pointer",
dog_breed == "German Wire Haired Pointer" ~ "German Wirehaired Pointer",
dog_breed == "Collie – Rough" ~ "Rough Collie",
dog_breed == "Collie – Smooth" ~ "Smooth Collie",
dog_breed == "Miniature Schnauzer" ~ "Schnauzer (Miniature)",
dog_breed == "Schnauzer Giant" ~ "Schnauzer (Giant)",
dog_breed == "Lagotto Rom" ~ "Lagotto Romagnolo",
dog_breed == "Brittany Spaniel" ~ "Brittany",
dog_breed == "Staghound" ~ "Stag Hound",
dog_breed == "Kerry Blue" ~ "Kerry Blue Terrier",
dog_breed == "English Toy terrier" ~ "English Toy Terrier",
dog_breed == "Parson Jack Russell Terrier" ~ "Parson Russell Terrier",
dog_breed == "Welsh Corgi – Pembroke" ~ "Welsh Corgi (Pembroke)",
dog_breed == "American Cocker Spaniel" ~ "Cocker Spaniel (American)",
dog_breed == "Basset Fauve De Bretagne" ~ "Basset Fauve de Bretagne",
dog_breed == "Norwegian Elk Hound" ~ "Norwegian Elkhound",
dog_breed == "Cheasapeake Bay Retriever" ~ "Chesapeake Bay Retriever",
dog_breed == "Bouvier Des Flandres" ~ "Bouvier des Flandres",
dog_breed == "Miniature Bull Terrier" ~ "Bull Terrier (Miniature)",
dog_breed == "Munsterlander – Large" ~ "Large Munsterlander",
dog_breed == "Welsh Corgi – Cardigan" ~ "Welsh Corgi (Cardigan)",
dog_breed == "HamiltonStovare" ~ "Hamiltonstovare",
dog_breed == "Blue tick Coonhound" ~ "Bluetick Coonhound",
dog_breed == "Japanese Akita" ~ "Akita (Japanese)",
dog_breed == "Dogue De Bordeaux" ~ "Dogue de Bordeaux",
dog_breed == "Italian Cane Corso" ~ "Italian Corso Dog",
# different kelpies but same category anyway
dog_breed == "Australian Kelpie Sheepdog" ~ "Australian Kelpie",
# typo
dog_breed == "Neopolitan Mastiff" ~ "Neapolitan Mastiff",
TRUE ~ as.character(dog_breed)))Dog breeds without match
Some further corrections still possible here:
dog_ownership_cost <- left_join(dog_ownership_cost,
dog_insurance %>%
select(dog_breed, cost_compared_to_other_breeds)
) %>%
mutate(
cost_compared_to_other_breeds = case_when(
# all the same
dog_breed == "Fox Terrier" ~ "Below average",
dog_breed == "Fox Terrier (Smooth)" ~ "Below average",
dog_breed == "Fox Terrier (Wire)" ~ "Below average",
dog_breed == "Schnauzer" ~ "Below average",
dog_breed == "Australian Stumpy Tail Cattle Dog" ~ "Below average",
dog_breed == "Foxhound" ~ "Below average",
dog_breed == "White Swiss Shepherd Dog" ~ "Below average",
# multiple options here, but all above going for conservative
dog_breed == "Welsh Corgi" ~ "Above average",
# taking values from Chihuahua
dog_breed == "Chihuahua (Smooth Coat)" ~ "Below average",
dog_breed == "Chihuahua (Long Coat)" ~ "Below average",
dog_breed == "Chihuahua (Long Coat)" ~ "Below average",
# naming - Dogue De Bordeaux
dog_breed == "French Mastiff" ~ "Significantly above average",
TRUE ~ as.character(cost_compared_to_other_breeds))) %>%
select(-permit_name)Few things left:
Two largest groups:
These observations will remain as NAs.
wide_all_n <- dog_ownership_cost %>%
group_by(SSC_NAME16, dog_breed) %>%
summarise(n = n()) %>%
ungroup() %>%
pivot_wider(names_from = dog_breed,
values_from = n) %>%
mutate_all(~replace(., is.na(.), 0)) %>%
clean_names() %>%
rename(SSC_NAME16 = ssc_name16)
wide_all_n <- right_join(SSC, wide_all_n)
wide_all_p <- dog_ownership_cost %>%
group_by(SSC_NAME16, dog_breed) %>%
summarise(n = n()) %>%
ungroup() %>%
group_by(SSC_NAME16) %>%
mutate(N = n()) %>%
ungroup() %>%
mutate(p = n/N) %>%
select(-n, -N) %>%
pivot_wider(names_from = dog_breed,
values_from = p) %>%
mutate_all(~replace(., is.na(.), 0)) %>%
clean_names() %>%
rename(SSC_NAME16 = ssc_name16)
wide_all_p <- right_join(SSC, wide_all_p)wide_cost_n <- dog_ownership_cost %>%
filter(expensive == 1) %>%
group_by(SSC_NAME16, dog_breed) %>%
summarise(n = n()) %>%
ungroup() %>%
pivot_wider(names_from = dog_breed,
values_from = n) %>%
mutate_all(~replace(., is.na(.), 0)) %>%
clean_names() %>%
rename(SSC_NAME16 = ssc_name16)
wide_cost_n <- right_join(SSC, wide_cost_n)
wide_cost_p <- dog_ownership_cost %>%
filter(expensive == 1) %>%
group_by(SSC_NAME16, dog_breed) %>%
summarise(n = n()) %>%
ungroup() %>%
group_by(SSC_NAME16) %>%
mutate(N = n()) %>%
ungroup() %>%
mutate(p = n/N) %>%
select(-n, -N) %>%
pivot_wider(names_from = dog_breed,
values_from = p) %>%
mutate_all(~replace(., is.na(.), 0)) %>%
clean_names() %>%
rename(SSC_NAME16 = ssc_name16)
wide_cost_p <- right_join(SSC, wide_cost_p)wide_insurance_n <- dog_ownership_cost %>%
filter(cost_compared_to_other_breeds != "Below average") %>%
group_by(SSC_NAME16, dog_breed) %>%
summarise(n = n()) %>%
ungroup() %>%
pivot_wider(names_from = dog_breed,
values_from = n) %>%
mutate_all(~replace(., is.na(.), 0)) %>%
clean_names() %>%
rename(SSC_NAME16 = ssc_name16)
wide_insurance_n <- right_join(SSC, wide_insurance_n)
wide_insurance_p <- dog_ownership_cost %>%
filter(cost_compared_to_other_breeds != "Below average") %>%
group_by(SSC_NAME16, dog_breed) %>%
summarise(n = n()) %>%
ungroup() %>%
group_by(SSC_NAME16) %>%
mutate(N = n()) %>%
ungroup() %>%
mutate(p = n/N) %>%
select(-n, -N) %>%
pivot_wider(names_from = dog_breed,
values_from = p) %>%
mutate_all(~replace(., is.na(.), 0)) %>%
clean_names() %>%
rename(SSC_NAME16 = ssc_name16)
wide_insurance_p <- right_join(SSC, wide_insurance_p)R version 4.1.2 (2021-11-01) Platform: x86_64-w64-mingw32/x64 (64-bit) Running under: Windows 10 x64 (build 18363) Matrix products: default attached base packages: [1] stats graphics grDevices utils datasets methods base other attached packages: [1] tmap_3.3-2 sf_1.0-5 gmodels_2.18.1 DT_0.20 [5] sjPlot_2.8.10 sjmisc_2.8.9 janitor_2.1.0 scales_1.1.1 [9] magrittr_2.0.1 readxl_1.3.1 forcats_0.5.1 stringr_1.4.0 [13] dplyr_1.0.7 purrr_0.3.4 readr_2.1.1 tidyr_1.1.4 [17] tibble_3.1.6 ggplot2_3.3.5 tidyverse_1.3.1 pacman_0.5.1To cite R in publications use:
R Core Team (2021). R: A Language and Environment for Statistical Computing. R Foundation for Statistical Computing, Vienna, Austria. https://www.R-project.org/.
To cite the ggplot2 package in publications use:Wickham H (2016). ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York. ISBN 978-3-319-24277-4, https://ggplot2.tidyverse.org.